R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

rm(list = ls(all=TRUE)) # clear global environment
library(tidyr) # data loading
library(dplyr) #dataframe manipulation
library(stringr) # string manipulation
# library(lubridate, warn.conflicts = FALSE) # work with date formats
library(ggplot2) # plotting

options(scipen = 999) # number formatting

path <- file.path("C:","Users","Paulina-laptop","Desktop","RRR")

setwd(path)
sprintf("Current directory: %s", getwd())
## [1] "Current directory: C:/Users/Paulina-laptop/Desktop/RRR"

Data loading & preparation

Let’s load some data first

wells <- read.csv(unzip("BCOGC_data/prod_csv.zip", "zone_prd_2016_to_present.csv"), skip = 1)

head(wells)
##   Wa_num Compltn_event_seq Prod_period              UWI Area_code Formtn_code
## 1     29                 0      201601 100142108318W600      3600        6200
## 2     29                 0      201603 100142108318W600      3600        6200
## 3     29                 0      201611 100142108318W600      3600        6200
## 4     29                 0      201612 100142108318W600      3600        6200
## 5     29                 0      201701 100142108318W600      3600        6200
## 6     29                 0      201702 100142108318W600      3600        6200
##   Pool_seq Gas_prod_vol..e3m3. Oil_prod_vol..m3. Water_prod_vol..m3.
## 1        A               107.7                 0                 2.3
## 2        A                45.2                 0                 1.7
## 3        A               166.2                 0                 5.0
## 4        A               159.2                 0                 2.6
## 5        A               133.5                 0                 3.7
## 6        A                95.8                 0                 4.1
##   Cond_prod_vol..m3. Prod_days. Gas_prod_cum..e3m3. Oil_prod_cum..m3.
## 1                0.5       31.0               107.7                 0
## 2                0.3       13.0               152.9                 0
## 3                1.3       22.0               319.1                 0
## 4                0.5       31.0               478.3                 0
## 5                0.6       31.0               611.8                 0
## 6                0.5       27.8               707.6                 0
##   Water_prod_cum..m3. Cond_prod_cum..m3. Project_code
## 1                 2.3                0.5            0
## 2                 4.0                0.8            0
## 3                 9.0                2.1            0
## 4                11.6                2.6            0
## 5                15.3                3.2            0
## 6                19.4                3.7            0

Column names in the original file have many strange interpunction, which could be easily fixed:

names(wells) <- str_replace_all(names(wells), c(" " = "" , 
                                                "," = "",
                                                "\\("="_",
                                                "\\)"="",
                                                "\\.."="_",
                                                "\\."=""
                                                ))
names(wells)
##  [1] "Wa_num"            "Compltn_event_seq" "Prod_period"      
##  [4] "UWI"               "Area_code"         "Formtn_code"      
##  [7] "Pool_seq"          "Gas_prod_vol_e3m3" "Oil_prod_vol_m3"  
## [10] "Water_prod_vol_m3" "Cond_prod_vol_m3"  "Prod_days"        
## [13] "Gas_prod_cum_e3m3" "Oil_prod_cum_m3"   "Water_prod_cum_m3"
## [16] "Cond_prod_cum_m3"  "Project_code"
wells <- subset(wells, select=c(Wa_num, Prod_period, Prod_days, Area_code, Formtn_code, Gas_prod_vol_e3m3, Oil_prod_vol_m3, Water_prod_vol_m3, Cond_prod_vol_m3)) # select only useful columns

area_codes <- read.table("BCOGC_data/ogc_area_codes.csv", 
                         sep = ",", skip = 1, header = TRUE, stringsAsFactors = FALSE)
head(area_codes,3)
##   Area.Code    Area.Name
## 1        50       ADSETT
## 2       100      AIRPORT
## 3       200 AITKEN CREEK
area_codes <- rename(area_codes, Area_code = Area.Code, Area_name = Area.Name)

formation_codes <- read.table("BCOGC_data/ogc_formation_codes.csv", 
                              sep = ",", skip = 1, header = TRUE, stringsAsFactors = FALSE)
head(formation_codes,3)
##   Formation.Code        Formation.Name
## 1           4610 A MARKER/BASE OF LIME
## 2           2703          AITKEN CREEK
## 3           9922              ALDRIDGE
formation_codes <- rename(formation_codes, Formtn_code = Formation.Code, Formtn_name = Formation.Name)

Merge dataframes to create 2 additional columns with area and name as the wells file contains only its numerical codes.

wells <- merge(wells, area_codes, by = 'Area_code')
wells <- merge(wells, formation_codes, by = 'Formtn_code')

# we don't need them anymore in memory and dataframe
rm(area_codes, formation_codes) # remove form memory
wells <- subset(wells, select = -c(Area_code, Formtn_code)) # remove from dataframe

First glimpse on the data. Notice newly created columns, Area_name and Formtn_name at the end.

names(wells)
## [1] "Wa_num"            "Prod_period"       "Prod_days"        
## [4] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3"   "Water_prod_vol_m3"
## [7] "Cond_prod_vol_m3"  "Area_name"         "Formtn_name"
str(wells)
## 'data.frame':    517518 obs. of  9 variables:
##  $ Wa_num           : int  14893 25370 25370 26962 14893 14893 25371 25371 25371 26962 ...
##  $ Prod_period      : int  201809 202003 201901 201709 201805 202001 202011 201912 201805 201907 ...
##  $ Prod_days        : num  30 30.7 30.8 23 31 31 28.5 30.4 30.4 28 ...
##  $ Gas_prod_vol_e3m3: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Oil_prod_vol_m3  : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Water_prod_vol_m3: num  91.4 1291 2528 1068 6948.7 ...
##  $ Cond_prod_vol_m3 : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Area_name        : chr  "DESAN" "PEEJAY WEST" "PEEJAY WEST" "BEAVERTAIL" ...
##  $ Formtn_name      : chr  "QUATERNARY" "QUATERNARY" "QUATERNARY" "QUATERNARY" ...
summary(wells)
##      Wa_num       Prod_period       Prod_days     Gas_prod_vol_e3m3 
##  Min.   :   29   Min.   :201601   Min.   : 0.00   Min.   :     0.0  
##  1st Qu.:15183   1st Qu.:201704   1st Qu.:25.60   1st Qu.:    38.2  
##  Median :22328   Median :201807   Median :30.00   Median :   124.8  
##  Mean   :21074   Mean   :201810   Mean   :26.08   Mean   :   555.4  
##  3rd Qu.:28257   3rd Qu.:201910   3rd Qu.:31.00   3rd Qu.:   608.5  
##  Max.   :41055   Max.   :202101   Max.   :31.10   Max.   :119042.3  
##  Oil_prod_vol_m3   Water_prod_vol_m3 Cond_prod_vol_m3   Area_name        
##  Min.   :   0.00   Min.   :    0.0   Min.   :   0.00   Length:517518     
##  1st Qu.:   0.00   1st Qu.:    0.9   1st Qu.:   0.00   Class :character  
##  Median :   0.00   Median :    5.0   Median :   0.00   Mode  :character  
##  Mean   :  10.77   Mean   :  181.4   Mean   :  19.31                     
##  3rd Qu.:   0.00   3rd Qu.:   52.6   3rd Qu.:   1.00                     
##  Max.   :7089.30   Max.   :29177.7   Max.   :7424.00                     
##  Formtn_name       
##  Length:517518     
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
length(unique(wells$Formtn_name)) # number of formations
## [1] 92
unique(wells$Formtn_name)
##  [1] "QUATERNARY"                   "CRETACEOUS"                  
##  [3] "CARDIUM SAND"                 "DOE CREEK"                   
##  [5] "DUNVEGAN"                     "BASE OF FISH SCALES"         
##  [7] "SCATTER"                      "PADDY"                       
##  [9] "PADDY- CADOTTE"               "CADOTTE"                     
## [11] "SPIRIT RIVER"                 "NOTIKEWIN"                   
## [13] "FALHER"                       "FALHER A"                    
## [15] "FALHER B"                     "FALHER C"                    
## [17] "FALHER D"                     "WILRICH"                     
## [19] "BLUESKY"                      "BASAL BLUESKY"               
## [21] "BLUESKY-GETHING"              "BLUESKY-GETHING-DETRITAL"    
## [23] "DETRITAL"                     "GETHING"                     
## [25] "LOWER GETHING"                "BASAL GETHING"               
## [27] "GETHING-BALDONNEL"            "CADOMIN"                     
## [29] "CHINKEH"                      "NIKANASSIN"                  
## [31] "DUNLEVY"                      "LOWER DUNLEVY"               
## [33] "ROCK CREEK"                   "NORDEGG"                     
## [35] "NORDEGG-BALDONNEL"            "PARDONET-BALDONNEL"          
## [37] "BALDONNEL"                    "BALDONNEL/UPPER CHARLIE LAKE"
## [39] "CHARLIE LAKE"                 "SIPHON"                      
## [41] "CECIL"                        "NANCY"                       
## [43] "FLATROCK"                     "BOUNDARY LAKE"               
## [45] "YELLOW MARKER"                "COPLIN"                      
## [47] "MICA"                         "BLUEBERRY"                   
## [49] "INGA"                         "NORTH PINE"                  
## [51] "BEAR FLAT"                    "PINGEL"                      
## [53] "LIMESTONE A BED"              "A MARKER/BASE OF LIME"       
## [55] "LOWER CHARLIE LAKE SANDS"     "ARTEX"                       
## [57] "ARTEX/HALFWAY"                "UPPER HALFWAY"               
## [59] "HALFWAY"                      "LOWER HALFWAY"               
## [61] "DOIG"                         "DOIG PHOSPHATE BEDS"         
## [63] "BLUESKY-GETHING-MONTNEY"      "LOWER CHARLIE LAKE/MONTNEY"  
## [65] "DOIG PHOSPHATE-MONTNEY"       "MONTNEY"                     
## [67] "BELLOY"                       "BELCOURT"                    
## [69] "BELCOURT-TAYLOR FLAT"         "BELLOY-KISKATINAW"           
## [71] "TAYLOR FLAT"                  "MISSISSIPPIAN"               
## [73] "KISKATINAW"                   "LOWER KISKATINAW"            
## [75] "BASAL KISKATINAW"             "UPPER DEBOLT"                
## [77] "DEBOLT"                       "ELKTON"                      
## [79] "SHUNDA"                       "PEKISKO"                     
## [81] "BANFF"                        "BESA RIVER"                  
## [83] "WABAMUN"                      "TETCHO"                      
## [85] "KAKISA"                       "JEAN MARIE"                  
## [87] "MUSKWA"                       "MUSKWA-OTTER PARK"           
## [89] "SLAVE POINT"                  "SULPHUR POINT"               
## [91] "EVIE"                         "PINE POINT"

Key dplyr functions:

filter

slave_pnt <- filter(wells, Formtn_name == "SLAVE POINT" )
unique(slave_pnt$Formtn_name)
## [1] "SLAVE POINT"
muskwa <- filter(wells, Formtn_name %in% c("MUSKWA", "MUSKWA-OTTER PARK"))
unique(muskwa$Formtn_name)
## [1] "MUSKWA"            "MUSKWA-OTTER PARK"
# same as
muskwa <- filter(wells, Formtn_name == "MUSKWA" | Formtn_name == "MUSKWA-OTTER PARK")
unique(muskwa$Formtn_name)
## [1] "MUSKWA"            "MUSKWA-OTTER PARK"

arrange - sort dataframe

Wa_num_sorted <- arrange(wells, Prod_period) # ascending
head(Wa_num_sorted)
##   Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1  14893      201601        31               0.0               0
## 2  25371      201601        31               0.0               0
## 3  26962      201601        31               0.0               0
## 4  25370      201601        31               0.0               0
## 5  17824      201601        31               0.1               0
## 6  21469      201601        31              25.5               0
##   Water_prod_vol_m3 Cond_prod_vol_m3    Area_name Formtn_name
## 1            4242.0                0        DESAN  QUATERNARY
## 2            4823.0                0  PEEJAY WEST  QUATERNARY
## 3            1368.0                0   BEAVERTAIL  QUATERNARY
## 4            2233.0                0  PEEJAY WEST  QUATERNARY
## 5               0.0                0         OJAY  CRETACEOUS
## 6               0.6                0 HIDING CREEK  CRETACEOUS
Wa_num_sorted <- arrange(wells, desc(Prod_period)) # descending
head(Wa_num_sorted)
##   Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1  25370      202101        31               0.0               0
## 2  17557      202101         3               0.0               0
## 3  23561      202101        31              58.2               0
## 4  16585      202101        31              77.1               0
## 5  14164      202101        31              32.0               0
## 6  18101      202101        31             108.5               0
##   Water_prod_vol_m3 Cond_prod_vol_m3   Area_name Formtn_name
## 1             798.0                0 PEEJAY WEST  QUATERNARY
## 2              27.6                0      HELMET  QUATERNARY
## 3               0.7                0        OJAY  CRETACEOUS
## 4               1.1                0        OJAY  CRETACEOUS
## 5               0.4                0        OJAY  CRETACEOUS
## 6               2.2                0        OJAY  CRETACEOUS
Wa_num_sorted <- arrange(wells, desc(Prod_period), Wa_num) # by multiple conditions
head(Wa_num_sorted)
##   Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1    101      202101      28.5               6.5            47.8
## 2    131      202101      31.0              10.6             0.0
## 3    152      202101      31.0               4.8            17.2
## 4    153      202101      30.7            1829.3             0.0
## 5    180      202101       0.5               1.7             0.0
## 6    244      202101      13.1              16.8             0.0
##   Water_prod_vol_m3 Cond_prod_vol_m3     Area_name   Formtn_name
## 1            1148.1              0.0 BOUNDARY LAKE BOUNDARY LAKE
## 2               1.1              0.2     NIG CREEK     BALDONNEL
## 3             487.6              0.0 BOUNDARY LAKE BOUNDARY LAKE
## 4              32.7              0.0      PARKLAND       WABAMUN
## 5               0.0              0.0           BEG       HALFWAY
## 6               0.0              0.3      STODDART        BELLOY
rm(Wa_num_sorted)

select - select by columns

names(wells)
## [1] "Wa_num"            "Prod_period"       "Prod_days"        
## [4] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3"   "Water_prod_vol_m3"
## [7] "Cond_prod_vol_m3"  "Area_name"         "Formtn_name"
head(select(wells, Area_name, Formtn_name))
##     Area_name Formtn_name
## 1       DESAN  QUATERNARY
## 2 PEEJAY WEST  QUATERNARY
## 3 PEEJAY WEST  QUATERNARY
## 4  BEAVERTAIL  QUATERNARY
## 5       DESAN  QUATERNARY
## 6       DESAN  QUATERNARY
head(select(wells, Gas_prod_vol_e3m3:Cond_prod_vol_m3, Area_name, Formtn_name))
##   Gas_prod_vol_e3m3 Oil_prod_vol_m3 Water_prod_vol_m3 Cond_prod_vol_m3
## 1                 0               0              91.4                0
## 2                 0               0            1291.0                0
## 3                 0               0            2528.0                0
## 4                 0               0            1068.0                0
## 5                 0               0            6948.7                0
## 6                 0               0             109.3                0
##     Area_name Formtn_name
## 1       DESAN  QUATERNARY
## 2 PEEJAY WEST  QUATERNARY
## 3 PEEJAY WEST  QUATERNARY
## 4  BEAVERTAIL  QUATERNARY
## 5       DESAN  QUATERNARY
## 6       DESAN  QUATERNARY

The same wells have been listed multiple times as there are various production periods each year. Let’s create new column Prod_year to make more general summary.

mutate - create new columns

head(wells$Prod_period)
## [1] 201809 202003 201901 201709 201805 202001
library(stringr)

wells <- wells %>%
  mutate(Prod_year = substr(Prod_period, 1, 4),
         Prod_month = substr(Prod_period, 5, 6),
         # Prod_ym = as.factor(paste(Prod_year, Prod_month))) #to create time "points" only from available dates
         Prod_ym = paste(Prod_year, Prod_month, sep="-")) #to create time "points" only from available dates

head(wells)
##   Wa_num Prod_period Prod_days Gas_prod_vol_e3m3 Oil_prod_vol_m3
## 1  14893      201809      30.0                 0               0
## 2  25370      202003      30.7                 0               0
## 3  25370      201901      30.8                 0               0
## 4  26962      201709      23.0                 0               0
## 5  14893      201805      31.0                 0               0
## 6  14893      202001      31.0                 0               0
##   Water_prod_vol_m3 Cond_prod_vol_m3   Area_name Formtn_name Prod_year
## 1              91.4                0       DESAN  QUATERNARY      2018
## 2            1291.0                0 PEEJAY WEST  QUATERNARY      2020
## 3            2528.0                0 PEEJAY WEST  QUATERNARY      2019
## 4            1068.0                0  BEAVERTAIL  QUATERNARY      2017
## 5            6948.7                0       DESAN  QUATERNARY      2018
## 6             109.3                0       DESAN  QUATERNARY      2020
##   Prod_month Prod_ym
## 1         09 2018-09
## 2         03 2020-03
## 3         01 2019-01
## 4         09 2017-09
## 5         05 2018-05
## 6         01 2020-01
class(wells$Prod_ym)
## [1] "character"

summarize

head(wells$Prod_period)
## [1] 201809 202003 201901 201709 201805 202001
prod_days_by_area <- wells %>%
  group_by(Area_name) %>%
  summarize(
    total_prod_days = sum(Prod_days)
    )

head(prod_days_by_area)
## # A tibble: 6 x 2
##   Area_name          total_prod_days
##   <chr>                        <dbl>
## 1 ADSETT                       6385.
## 2 AIRPORT                       141.
## 3 AITKEN CREEK                 4164.
## 4 AITKEN CREEK NORTH           1885.
## 5 ALTARES                     14861.
## 6 BEAVERDAM                    6439.
prod_days_by_area %>% filter(total_prod_days > 500000) %>% 
  ggplot(aes(x = Area_name, y = total_prod_days)) + 
  geom_col() + 
  ggtitle("Total Production Days for most productive areas", ) +
  ylab("Production days") + 
  xlab("Area")

## histograms

wells %>% filter(Cond_prod_vol_m3 > 0) %>% 
  ggplot() + 
  geom_histogram(mapping = aes(Cond_prod_vol_m3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

wells %>% filter(Cond_prod_vol_m3 > 100, Cond_prod_vol_m3 < 1000) %>% 
  ggplot() + 
  geom_histogram(mapping = aes(Cond_prod_vol_m3))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

wells %>% filter(Cond_prod_vol_m3 > 100, Cond_prod_vol_m3 < 1000) %>% 
  ggplot() + 
  geom_histogram(mapping = aes(Cond_prod_vol_m3), bins=200) + 
  ggtitle("Distribution of condensate production per production period") + 
  xlab(expression("Condensate production per production period m"^3))

Data analysis We can check general trends in production by formation and area. Lets start analyzing by formation ## tables

# Using tables we can see how many wells we have for each formation
wells %>% group_by(Formtn_name) %>%
  summarize(
    count_wells = n_distinct(Wa_num),#count unique well numbers (multiple prod. periods)
    mean_wtr_prod = mean(Water_prod_vol_m3)
  ) %>% arrange(desc(count_wells))
## # A tibble: 92 x 3
##    Formtn_name             count_wells mean_wtr_prod
##    <chr>                         <int>         <dbl>
##  1 MONTNEY                        4745        150.  
##  2 JEAN MARIE                     1531          3.21
##  3 HALFWAY                         659         62.2 
##  4 BLUESKY                         589       2038.  
##  5 CADOMIN                         535         10.7 
##  6 NOTIKEWIN                       406          5.03
##  7 BALDONNEL                       356         62.3 
##  8 BLUESKY-GETHING-MONTNEY         354          1.34
##  9 GETHING                         330          2.33
## 10 BOUNDARY LAKE                   278        483.  
## # ... with 82 more rows
# we can find which areas / formations suspected to have more water disposal wells?
wells %>% group_by(Area_name) %>%
  summarize(
    count_wells = n_distinct(Wa_num), 
    mean_wtr_prod = mean(Water_prod_vol_m3) 
  ) %>% arrange(desc(count_wells))
## # A tibble: 169 x 3
##    Area_name        count_wells mean_wtr_prod
##    <chr>                  <int>         <dbl>
##  1 HERITAGE                2921        155.  
##  2 NORTHERN MONTNEY        1797        145.  
##  3 HELMET                   549         15.6 
##  4 DEEP BASIN               528          2.91
##  5 GUNNELL CREEK            471          2.46
##  6 BOUNDARY LAKE            313        431.  
##  7 RING                     213          1.41
##  8 HAY RIVER                211       4481.  
##  9 SIERRA                   207         27.3 
## 10 HORN RIVER               200         97.5 
## # ... with 159 more rows

scatterplots

We can make our data in the long format and have all types of hydrocarbon production in one column using gather function.

alt text here

wells %>% select(Water_prod_vol_m3, Gas_prod_vol_e3m3, Oil_prod_vol_m3, Cond_prod_vol_m3) %>% 
  head(3)
##   Water_prod_vol_m3 Gas_prod_vol_e3m3 Oil_prod_vol_m3 Cond_prod_vol_m3
## 1              91.4                 0               0                0
## 2            1291.0                 0               0                0
## 3            2528.0                 0               0                0
prod_df <- wells %>% 
  gather(key = "Prod_type",
         value = "Prod_volume",
         c(Gas_prod_vol_e3m3, Oil_prod_vol_m3, Cond_prod_vol_m3)) %>% 
  select(Water_prod_vol_m3, Prod_volume, Prod_year, Prod_ym, Prod_type, Prod_days, Formtn_name)

prod_df %>% select(Water_prod_vol_m3, Prod_type) %>% head(3)
##   Water_prod_vol_m3         Prod_type
## 1              91.4 Gas_prod_vol_e3m3
## 2            1291.0 Gas_prod_vol_e3m3
## 3            2528.0 Gas_prod_vol_e3m3
unique(prod_df$Prod_type)
## [1] "Gas_prod_vol_e3m3" "Oil_prod_vol_m3"   "Cond_prod_vol_m3"
prod_df %>% 
  filter(Prod_year == 2021) %>% 
  ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
  geom_point()

# limit axis
prod_df %>% 
  filter(Prod_year == 2021) %>% 
  ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
  geom_point() + 
  xlim(0,10000) + 
  ylim(0,25000)
## Warning: Removed 45 rows containing missing values (geom_point).

prod_df %>% 
  filter(Prod_year == 2021) %>% 
  ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type)) +
  geom_point() + 
  facet_wrap(~Prod_type)

prod_df %>% 
  filter(Prod_year == 2021) %>% 
  ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type, alpha = 0.3)) +
  geom_point() + 
  facet_wrap(~Prod_type, scales = "free") # scales free => not fixed limits of yaxis

# I expect that Prod_volume > 0 will indicate the SWD disposal wells, let's remove those from the plot

prod_df %>% 
  filter(Prod_year == 2021, Prod_volume > 0) %>% # Prod_colume > 0 to remove disposal wells 
  ggplot(aes(x = Water_prod_vol_m3, y = Prod_volume, colour = Prod_type, alpha = 0.3)) +
  geom_point() + 
  ggtitle("Water production per HC type") +
  facet_wrap(~Prod_type, scales = "free") +
  xlim(0,5000) 
## Warning: Removed 126 rows containing missing values (geom_point).

bar charts

wells %>% subset(Prod_year > 2000) %>% 
  group_by(Prod_ym) %>% 
  summarise(
    prod_per_period = sum(Cond_prod_vol_m3)
  ) %>% 
  ggplot(aes(x = Prod_ym, y = prod_per_period)) + 
  theme(axis.text.x = element_text(angle = 90)) + 
  geom_col()

wells %>% 
  subset(Prod_year > 2000) %>%
  group_by(Prod_year) %>% 
  summarise(
    prod_per_year = sum(Cond_prod_vol_m3)
  ) %>% 
  ggplot(aes(x = Prod_year, y = prod_per_year)) + 
  geom_col()

class(wells$Prod_period)
## [1] "integer"

line graphs

unique(wells$Formtn_name)[1:20]
##  [1] "QUATERNARY"          "CRETACEOUS"          "CARDIUM SAND"       
##  [4] "DOE CREEK"           "DUNVEGAN"            "BASE OF FISH SCALES"
##  [7] "SCATTER"             "PADDY"               "PADDY- CADOTTE"     
## [10] "CADOTTE"             "SPIRIT RIVER"        "NOTIKEWIN"          
## [13] "FALHER"              "FALHER A"            "FALHER B"           
## [16] "FALHER C"            "FALHER D"            "WILRICH"            
## [19] "BLUESKY"             "BASAL BLUESKY"
prod_df %>%
  filter(Formtn_name == "BOUNDARY LAKE") %>% 
  group_by(Prod_ym, Formtn_name, Prod_type) %>% 
  summarise(
    total_prod = sum(Prod_volume)
    ) %>% 
  ungroup() %>% 
  ggplot(aes(x = Prod_ym, y = total_prod, group=Formtn_name, col=Prod_type)) +
  geom_line() + 
  scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01",
                              "2019-01","2020-01", "2021-01")) +
  facet_grid(Prod_type ~ ., scales = "free")
## `summarise()` has grouped output by 'Prod_ym', 'Formtn_name'. You can override using the `.groups` argument.

# all formations on one chart
prod_df %>%
  group_by(Prod_ym, Prod_type) %>% 
  summarise(
    total_prod = sum(Prod_volume)
    ) %>% 
  ungroup() %>% 
  ggplot(aes(x = Prod_ym, y = total_prod, group=1, col=Prod_type)) +
  geom_line() + 
  scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01", 
                              "2019-01","2020-01", "2021-01")) +
  facet_grid(Prod_type ~ ., scales = "free")
## `summarise()` has grouped output by 'Prod_ym'. You can override using the `.groups` argument.

We can apply simple trick to order the formations from the highest production to the lowest (using mutate function).

We save most productive formations separately, they will be useful later to plot production in time.

prod_by_formation <- wells %>% 
  group_by(Formtn_name) %>%
  summarise(
    oil_prod_total = sum(Oil_prod_vol_m3),
    gas_prod_total = sum(Gas_prod_vol_e3m3),
    cond_prod_total = sum(Cond_prod_vol_m3), 
    water_prod_total = sum(Water_prod_vol_m3)
    ) %>%
  ungroup()

most_gas <- prod_by_formation %>% 
  arrange(desc(gas_prod_total)) %>% 
  head(5) 

ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
  geom_col(fill='#ff6347') # change color to red

# trick using mutate to order the bars according to production value 
most_gas <- most_gas %>%  
  mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))

ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
  geom_col(fill='#ff6347') # change color to red

# same for oil production
most_oil <- prod_by_formation %>% 
  arrange(desc(oil_prod_total)) %>% 
  head(5) %>% 
  mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))

ggplot(most_oil, aes(x = Formtn_name, y = oil_prod_total)) +
  geom_col()

# and for condensate production
most_cond <- prod_by_formation %>% 
  arrange(desc(cond_prod_total)) %>% 
  head(5) %>% 
  mutate(Formtn_name=factor(Formtn_name, levels=Formtn_name))

ggplot(most_gas, aes(x = Formtn_name, y = gas_prod_total)) +
  geom_col(fill="#6495ed")

We can also visualize the same set of formations. To define the set, we will take 5 most productive gas, oil and condensate formations.

# define 5 most productive formations
best_formations <- unique(most_oil$Formtn_name,
                            most_gas$Formtn_name,
                            most_cond$Formtn_name)

wells %>% 
  subset(Formtn_name %in% best_formations) %>%
  ggplot(aes(x = Formtn_name, y = Oil_prod_vol_m3, fill=Formtn_name)) +
    geom_bar(stat = "summary", fun = "sum") +
  ggtitle("Total oil production by formation") +
  xlab("Formation") +
  ylab("Oil production [m3]")

prod_by_formation %>% 
  subset(Formtn_name %in% best_formations) %>%
  ggplot(aes(x = Formtn_name, y = gas_prod_total, fill=Formtn_name)) +
    geom_bar(stat = "summary", fun = "sum") + 
  ggtitle("Total gas production by formation") + 
  xlab("Formation") + 
  ylab("Gas production [e3m3]")

prod_by_formation %>% 
  subset(Formtn_name %in% best_formations) %>%
  ggplot(aes(x = Formtn_name, y = cond_prod_total, fill=Formtn_name)) +
    geom_bar(stat = "summary", fun = "sum") + 
  ggtitle("Total condensate production by formation") + 
  xlab("Formation") + 
  ylab("Condensate production [m3]")

prod_by_formation %>% 
  subset(Formtn_name %in% best_formations) %>%
  ggplot(aes(x = Formtn_name, y = water_prod_total, fill=Formtn_name)) +
    geom_bar(stat = "summary", fun = "sum") + 
  ggtitle("Total water production by formation") + 
  xlab("Formation") + 
  ylab("Water production [m3]")

FACETS - subplots according to some variable

#library(tidyverse)

ggplot(data = wells, aes(x = Prod_year)) + geom_bar()

# FILTER FROMATIONS WITH MOST WELLS
ggplot(filter(wells, Formtn_name %in% best_formations & Prod_year < 2021)) +
  geom_bar(aes(x = Formtn_name, fill=Formtn_name)) + 
  scale_fill_manual(values = c('#d0d1e6','#a6bddb','#67a9cf','#1c9099','#016c59')) + # taken e.g. from colorbrewer!
  facet_wrap(~ Prod_year) + 
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank(),
        axis.title.x = element_blank())

Bonus: Interactive area charts!

If we want to compare oil and gas production in one plot its useful to produce the formation of the dataset (https://www.joyofdata.de/blog/wp-content/uploads/2012/11/Clipboard16.png)

library(plotly)
## Warning: package 'plotly' was built under R version 4.0.4
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
head(prod_df)
##   Water_prod_vol_m3 Prod_volume Prod_year Prod_ym         Prod_type Prod_days
## 1              91.4           0      2018 2018-09 Gas_prod_vol_e3m3      30.0
## 2            1291.0           0      2020 2020-03 Gas_prod_vol_e3m3      30.7
## 3            2528.0           0      2019 2019-01 Gas_prod_vol_e3m3      30.8
## 4            1068.0           0      2017 2017-09 Gas_prod_vol_e3m3      23.0
## 5            6948.7           0      2018 2018-05 Gas_prod_vol_e3m3      31.0
## 6             109.3           0      2020 2020-01 Gas_prod_vol_e3m3      31.0
##   Formtn_name
## 1  QUATERNARY
## 2  QUATERNARY
## 3  QUATERNARY
## 4  QUATERNARY
## 5  QUATERNARY
## 6  QUATERNARY
prod_per_period <- prod_df %>% 
  filter(Formtn_name %in% best_formations) %>% 
  group_by(Prod_ym, Prod_type, Formtn_name) %>% 
  summarise(
    prod_per_period = sum(Prod_volume)
  )
## `summarise()` has grouped output by 'Prod_ym', 'Prod_type'. You can override using the `.groups` argument.
p <- ggplot(prod_per_period, aes(x=Prod_ym, y=prod_per_period, group=Formtn_name, fill=Formtn_name)) + 
  geom_area(alpha=0.3) + 
  scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01", 
                              "2019-01","2020-01", "2021-01")) +
  facet_grid(Prod_type ~ ., scales = "free")

p <- ggplotly(p, tooltip = "text")
p
p <- ggplot(prod_per_period, aes(x=Prod_ym, y=prod_per_period, group=Prod_type, 
                                 text = paste("Formation:", Formtn_name, 
                                              "<br>Prod. type:", Prod_type), fill=Prod_type)) +
  geom_area(colour="#636363", alpha=0.3) + 
  scale_x_discrete(breaks = c("2016-01", "2017-01","2018-01", 
                              "2019-01","2020-01", "2021-01")) + 
  facet_wrap(Formtn_name ~ ., scales = "free") + 
  ylab("") + 
  xlab("")

p <- ggplotly(p, tooltip = "text")
p
# load well information (coordinates)
# load shapefiles with areas

library(sf)
## Warning: package 'sf' was built under R version 4.0.4
## Linking to GEOS 3.9.0, GDAL 3.2.1, PROJ 7.2.1
# load dplyr and ggplot2 in case this is the only chunk you want to run
library(dplyr) 
library(ggplot2) 

wells_coords <- read.csv(unzip("BCOGC_data/drill_csv.zip", "wells.csv"), skip = 1)  

head(wells_coords)#bigger file
##       Well.Surf.Loc                                               Well.Name
## 1 D- 080-H/092-G-03                    ROYAL   CITY NO. 1 D- 080-H/092-G-03
## 2 B- 050-D/082-G-01             CANADIAN   KOOTENAY NO. 1 B- 050-D/082-G-01
## 3 D- 055-A/082-G-02                   BORDER OILS   NO. 1 D- 055-A/082-G-02
## 4      13-11-081-17                           CANLIN   PINGEL  13-11-081-17
## 5 A- 065-E/093-I-16 CVE ENERGY ET AL  LONE MOUNTAIN NO. 1 A- 065-E/093-I-16
## 6 C- 048-D/103-G-05     STRATHCONA   QUEEN CHARLOTTE NO.1 C- 048-D/103-G-05
##   WA.Num Surf.Nad27.Lat Surf.Nad27.Long Surf.Nad83.Lat Surf.Nad83.Long
## 1      1       49084860       123064913       49085314       123065320
## 2      2       49020730       114294872       49035335       114497851
## 3      3       49025250       114331128       49047892       114554121
## 4      4             NA              NA       56004511       120331605
## 5      5       54530889       120254600       54530863       120255126
## 6      6       53172250       131590333       53171412       131575692
##   Surf.UTM.Zone.Num Surf.UTM83.Northng Surf.UTM83.Eastng Surf.North Surf.East
## 1                10            5443925          491629.8     -192.0     -71.3
## 2                11            5434400          682875.8         NA        NA
## 3                11            5435662          678718.6         NA        NA
## 4                10            6210173          652456.4     -201.2     221.3
## 5                10            6085099          664789.3      274.9    -285.3
## 6                 9            5908329          302312.1     -463.9    -107.9
##   Surf.Owner Ground.Elevtn Directional.Flag Surf.Ref.Corner Surf.Ref.Unit
## 1      CROWN           1.5                N              NE            80
## 2      CROWN            NA                N              NW            50
## 3      CROWN            NA                N              SW            55
## 4      CROWN         758.6                N              NW            NA
## 5      CROWN         971.6                N              SE            65
## 6      CROWN           8.3                N              NE            48
##   Surf.Ref.Block Surf.Ref.Map Surf.Ref.Lsd Surf.Ref.Sect Surf.Ref.Twp
## 1              H     092-G-03           NA            NA           NA
## 2              D     082-G-01           NA            NA           NA
## 3              A     082-G-02           NA            NA           NA
## 4                                       13            11           81
## 5              E     093-I-16           NA            NA           NA
## 6              D     103-G-05           NA            NA           NA
##   Surf.Ref.Range Surf.DLS.Exception Surf.Lsd Surf.Sect Surf.Twp Surf.Range
## 1             NA                          NA        NA       NA         NA
## 2             NA                          NA        NA       NA         NA
## 3             NA                          NA        NA       NA         NA
## 4             17                          13        11       81         17
## 5             NA                          NA        NA       NA         NA
## 6             NA                          NA        NA       NA         NA
##   Surf.Qtr.Unit Surf.NTS.Exception Surf.Unit Surf.Block Surf.Map Oper.Id
## 1             D                           80          H 092-G-03       0
## 2             B                           50          D 082-G-01       0
## 3             D                           55          A 082-G-02       0
## 4                                         NA                        1018
## 5             A                           65          E 093-I-16    1016
## 6             C                           48          D 103-G-05   11075
##   Oper.Abbrev Oper.Abbrev2 Optnl.Unit       Well.Area.Name Well.Name.Date
## 1       ROYAL                                   CITY NO. 1       19480610
## 2    CANADIAN                               KOOTENAY NO. 1       19490903
## 3 BORDER OILS                                        NO. 1       19480704
## 4      CANLIN                                       PINGEL       20171206
## 5  CVE ENERGY        ET AL             LONE MOUNTAIN NO. 1       20180607
## 6  STRATHCONA                         QUEEN CHARLOTTE NO.1       20200901
##   Special.Well.Class.Code Test.Hole
## 1                      NO         N
## 2                      NO         N
## 3                                 N
## 4                      NO         N
## 5                      NO         N
## 6                      NO         N
names(wells_coords)
##  [1] "Well.Surf.Loc"           "Well.Name"              
##  [3] "WA.Num"                  "Surf.Nad27.Lat"         
##  [5] "Surf.Nad27.Long"         "Surf.Nad83.Lat"         
##  [7] "Surf.Nad83.Long"         "Surf.UTM.Zone.Num"      
##  [9] "Surf.UTM83.Northng"      "Surf.UTM83.Eastng"      
## [11] "Surf.North"              "Surf.East"              
## [13] "Surf.Owner"              "Ground.Elevtn"          
## [15] "Directional.Flag"        "Surf.Ref.Corner"        
## [17] "Surf.Ref.Unit"           "Surf.Ref.Block"         
## [19] "Surf.Ref.Map"            "Surf.Ref.Lsd"           
## [21] "Surf.Ref.Sect"           "Surf.Ref.Twp"           
## [23] "Surf.Ref.Range"          "Surf.DLS.Exception"     
## [25] "Surf.Lsd"                "Surf.Sect"              
## [27] "Surf.Twp"                "Surf.Range"             
## [29] "Surf.Qtr.Unit"           "Surf.NTS.Exception"     
## [31] "Surf.Unit"               "Surf.Block"             
## [33] "Surf.Map"                "Oper.Id"                
## [35] "Oper.Abbrev"             "Oper.Abbrev2"           
## [37] "Optnl.Unit"              "Well.Area.Name"         
## [39] "Well.Name.Date"          "Special.Well.Class.Code"
## [41] "Test.Hole"
wells_coords <- wells_coords %>% select(Surf.UTM83.Northng, Surf.UTM83.Eastng, Surf.UTM.Zone.Num, Well.Area.Name, Oper.Abbrev, Surf.Owner)

wells_coords <- wells_coords %>% filter(complete.cases(.), Surf.UTM.Zone.Num %in% c(10,11))
unique(wells_coords$Surf.UTM.Zone.Num)
## [1] 10 11
# create 2 geodataframes with crs values according to UTM zone

wells_pts_1 <- wells_coords %>% 
  filter(Surf.UTM.Zone.Num == 10) %>% 
  st_as_sf(coords = c("Surf.UTM83.Eastng", "Surf.UTM83.Northng"), crs = 26910) %>% 
  st_transform(4326) # want to use latlon coords

wells_pts_2 <- wells_coords %>% 
  filter(Surf.UTM.Zone.Num == 11) %>%  
  st_as_sf(coords = c("Surf.UTM83.Eastng", "Surf.UTM83.Northng"), crs = 4326)

wells_pts <- rbind(wells_pts_1, wells_pts_2)
st_crs(wells_pts)
## Coordinate Reference System:
##   User input: EPSG:4326 
##   wkt:
## GEOGCRS["WGS 84",
##     DATUM["World Geodetic System 1984",
##         ELLIPSOID["WGS 84",6378137,298.257223563,
##             LENGTHUNIT["metre",1]]],
##     PRIMEM["Greenwich",0,
##         ANGLEUNIT["degree",0.0174532925199433]],
##     CS[ellipsoidal,2],
##         AXIS["geodetic latitude (Lat)",north,
##             ORDER[1],
##             ANGLEUNIT["degree",0.0174532925199433]],
##         AXIS["geodetic longitude (Lon)",east,
##             ORDER[2],
##             ANGLEUNIT["degree",0.0174532925199433]],
##     USAGE[
##         SCOPE["Horizontal component of 3D system."],
##         AREA["World."],
##         BBOX[-90,-180,90,180]],
##     ID["EPSG",4326]]
geoph_lines <- st_read("BCOGC_data/shapefiles/PASR_GEOPHYSICAL_LN.shp")
## Reading layer `PASR_GEOPHYSICAL_LN' from data source `C:\Users\Paulina-laptop\Desktop\RRR\BCOGC_data\shapefiles\PASR_GEOPHYSICAL_LN.shp' using driver `ESRI Shapefile'
## Simple feature collection with 211710 features and 19 fields
## Geometry type: MULTILINESTRING
## Dimension:     XY
## Bounding box:  xmin: 820855.6 ymin: 535917.4 xmax: 1885851 ymax: 1672546
## Projected CRS: NAD83 / BC Albers
geoph_lines_dawson <- filter(geoph_lines, PROG_NAME %in% c("DAWSON 3D","DAWSON 3D EXTENSION"))
geoph_lines_dawson <- st_transform(geoph_lines_dawson, st_crs(wells_pts))

dawson_area <- st_union(geoph_lines_dawson) %>% 
  st_convex_hull()
## although coordinates are longitude/latitude, st_union assumes that they are planar
wells_dawson <- wells_pts %>%
  filter(st_intersects(x = ., y = dawson_area, sparse = FALSE))
## Warning in st_is_longlat(x): bounding box has potentially an invalid value range
## for longlat data
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
## although coordinates are longitude/latitude, st_intersects assumes that they are planar
ggplot() + 
  geom_sf(data = wells_dawson, size=1, alpha=0.2, aes(colour=Surf.Owner)) + 
  geom_sf(data = geoph_lines_dawson, size=0.1)

names(wells_dawson)
## [1] "Surf.UTM.Zone.Num" "Well.Area.Name"    "Oper.Abbrev"      
## [4] "Surf.Owner"        "geometry"
pools <- st_read("BCOGC_data/shapefiles/POOL_CONTOURS_LN.shp")
## Reading layer `POOL_CONTOURS_LN' from data source `C:\Users\Paulina-laptop\Desktop\RRR\BCOGC_data\shapefiles\POOL_CONTOURS_LN.shp' using driver `ESRI Shapefile'
## Simple feature collection with 7963 features and 12 fields
## Geometry type: MULTILINESTRING
## Dimension:     XY
## Bounding box:  xmin: 1091006 ymin: 1070402 xmax: 1386930 ymax: 1679933
## Projected CRS: NAD83 / BC Albers
pools <- st_transform(pools, st_crs(wells_dawson))
pools_dawson <- pools %>%
  filter(st_within(x = ., y = dawson_area, sparse = FALSE))
## although coordinates are longitude/latitude, st_within assumes that they are planar
ggplot() + 
  geom_sf(data = wells_dawson, alpha=0.2) + 
  geom_sf(data = geoph_lines_dawson, alpha=0.2) + 
  geom_sf(data = pools_dawson, aes(colour=FLUID_TYPE)) +
  scale_color_discrete(name = "Producing pools") +
  xlab("Longitude") + 
  ylab("Latitude")

fields <- st_read("BCOGC_data/shapefiles/FIELDS_PY.shp")
## Reading layer `FIELDS_PY' from data source `C:\Users\Paulina-laptop\Desktop\RRR\BCOGC_data\shapefiles\FIELDS_PY.shp' using driver `ESRI Shapefile'
## Simple feature collection with 570 features and 5 fields
## Geometry type: MULTIPOLYGON
## Dimension:     XY
## Bounding box:  xmin: 1079392 ymin: 1066850 xmax: 1387498 ymax: 1680540
## Projected CRS: NAD83 / BC Albers
ggplot() + 
  geom_sf(data = fields)

fields <- st_transform(fields, st_crs(wells_dawson))

# let's focus on the fields in the smaller area (crop the layer)
fields <- st_crop(fields, xmin = -121, xmax = -120, ymin = 55.7, ymax = 56)
## although coordinates are longitude/latitude, st_intersection assumes that they are planar
## Warning: attribute variables are assumed to be spatially constant throughout all
## geometries
unique(fields$FLDRNM)
##  [1] "MICA"           "PARKLAND"       "DOE"            "DAWSON CREEK"  
##  [5] "GROUNDBIRCH"    "SATURN"         "SUNSET PRAIRIE" "BRIAR RIDGE"   
##  [9] "BRASSEY"        "TOWER LAKE"     "SUNDOWN"        "SUNRISE"
ggplot() + 
  geom_sf(data = wells_dawson, alpha=0.2) +
  geom_sf(data = geoph_lines_dawson, alpha=0.2) +
  geom_sf(data = pools, aes(colour=FLUID_TYPE)) +
  geom_sf(data = fields, alpha=0.1, linetype="dashed") +
  scale_color_discrete(name = "Producing pools") +
  coord_sf(crs = 4326, xlim = c(-121,-120), ylim = c(55.7, 56)) + 
  xlab("Longitude") + 
  ylab("Latitude")